/* This dofile is to clean and match 295 new follow-up survey data for grade 12 students with the current database */

// import dataset
import excel "/Users/ye/Downloads/Follow-up_questions_on_final_decisions_-_Grade_12_-_all_versions_-_labels_-_2022-09-16-23-45-59.xlsx", clear firstrow

g indexnumber = _n

ren Studentcode studentid 
ren Schoolname schoolname 
ren Studentname name
ren CitizenshipIDNo citizenid
ren Didthestudentapplyforcol i_appcollege
ren stchoice i_appcollege_major1
ren ndchoice i_appcollege_major2
ren rdchoice i_appcollege_major3
ren ifothersPleasespecify i_appcollege_major_others

ren Wasthestudentadmittedto i_admcollege
ren Whichcollegemajorwasthe i_admcollege_major
ren S i_admcollege_major_others

ren Didthestudentapplyforte i_apptvet
ren U i_admtvet

ren Whatwasthestudentsfinal i_findecision
ren ifotherspleasespecify i_findecision_others
ren NameofthecollegeorTTII i_institute
ren IfOthersPleaseSpecify i_institute_others

rename *, lower   // three repeated var

replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace schoolname = trim(itrim(lower(schoolname)))
tab schoolname

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

gen dateofbirth1 = date(dateofbirth, "DMY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "Male"
replace sex = 2 if gender == "Female"
replace sex = 3 if gender == "Other "

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

// correct for typos: this is done after finishing step 3, then manually check 

// remove duplicated observations 
sort schoolname name studentid sex 
g dup = 0
forval i=1/3 {
replace dup = 1 if schoolname == schoolname[_n-`i'] & name == name[_n-`i'] & sex == sex[_n-`i'] & studentid==studentid[_n-`i']
}

// save the list of duplicated observations and send to Cheku
preserve 
duplicates tag schoolname name sex studentid, g(duplicatedobservations)
tab duplicatedobservations
keep if duplicatedobservations ~= 0
keep start end enumeratorid schoolname name studentid citizenid gender phonenumber
export excel using "$temp/File1_grade12_degree_data_duplicatedobservations_$date.xlsx", replace firstrow(variables)
restore 

keep if dup == 0

keep indexnumber name studentid citizenid b_* dateofbirth sex schoolname i_appcollege i_admcollege i_appcollege_major1 i_appcollege_major2 i_appcollege_major3 i_appcollege_major_others i_admcollege i_admcollege_major i_admcollege_major_others i_apptvet i_admtvet i_findecision i_findecision_others i_institute i_institute_others
foreach variable in schoolname name studentid citizenid dateofbirth sex b_month b_day b_year{
	rename `variable' `variable'_s 
}

save "$temp/tvetchoice.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 12 SURVEY DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON STUDENT ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria studentid  

* open the tvet choice data 
use "$temp/tvetchoice.dta", clear 
rename studentid_s studentid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename studentid studentid_s
save $temp/tvet_step1d, replace


* open the survey data 
use "$temp/tvet_analysis.dta", clear
replace name = subinstr(name," ","",.)  
replace sex = 2 if sex == 0 
format studentid_b %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data12.dta", replace 


use "$temp/data12.dta", clear
rename studentid_b studentid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename studentid studentid_b 
save $temp/data12_step1d, replace

* merging two unique data together using fully matched student ID & check fuzzy name 
use $temp/tvet_step1u, clear
merge 1:1 $criteria using $temp/data12_step1u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)
sum namescore if _merge==3
br namescore bscore if _merge==3

* correct if names are not matched, leave for the next round of matching
br if (namescore <= 0.5 & bscore < 0.8) & _merge == 3
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber
save $temp/tvet_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step1d
save $temp/tvet_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step1d
save $temp/data12_step2b, replace 
restore 


// STEP 2: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria citizenid 

* open the tvet choice data 
use $temp/tvet_step2b.dta, clear 
rename citizenid_s citizenid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_s
save $temp/tvet_step2d, replace


* open the survey data 
use $temp/data12_step2b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step2d, replace


* merging two unique data together using fully matched citizenship ID & check fuzzy name 
use $temp/tvet_step2u
merge 1:1 $criteria using $temp/data12_step2u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)
br namescore bscore if _merge==3

sum namescore if _merge==3

replace _merge = . if namescore < 0.7 & bscore < 0.9 & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber 
save $temp/tvet_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step2d
save $temp/tvet_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step2d
save $temp/data12_step3b, replace 
restore  



// STEP 3: PERFECT MATCHING ON SEX + DATE OF BIRTH
********************************************************************************


// combine data together 
********************************************************************************
clear 
append using $temp/tvet_step1m
g tvet_match_criteria = "perfect(studentID) + fuzzy(name/dob)"

append using $temp/tvet_step2m
replace tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)" if missing(tvet_match_criteria)

tab tvet_match_criteria

merge 1:1 indexnumber using "$temp/tvetchoice.dta", keep(matched) keepusing(i_*) nogen
merge 1:1 uniqueid using "$temp/grade12_analysis" 
g tvet_data = 1 if _merge == 3 
replace tvet_data = 0 if _merge == 2
drop _merge 
lab var tvet_data "1/0 tvet data available"

g tvet_institute = 1 if i_admtvet == "Yes" 
replace tvet_institute = 0 if i_admtvet ~= "Yes" & ~missing(i_appcollege) // ?
lab var tvet_institute "1/0 enrolled in tvet institute after grade 12"

save "$clean/grade12_analysis_survey916.dta", replace 




* clear all temporary datasets 
global tempfilelist: dir "$temp/" files "*.dta"
	foreach tfile of global tempfilelist {
	erase "$temp/`tfile'"
}

